# ------------------- #
#    Impute data      #
# ------------------- #

## Clear working space
rm(list = ls())

## Load in data
qog                   <- haven::read_dta("data/qog_std_ts_jan20.dta")
polity                <- readxl::read_excel("data/p5v2018d.xls")
full.data             <- get({data(cow_nmc)})
civil_cow             <- haven::read_dta("data/intra_state_wars_cow.dta")
state.names           <- read.csv("data/states2016.csv")
adjacency_matricies   <- readRDS("intl_order/adjacency_matricies_replication.rds")
intl_groups           <- readRDS("intl_order/international_order_replication.rds")
network_layers        <- readRDS("intl_order/network_layers_replication.rds")
vdem                  <- readRDS("data/V-Dem-CY-Core-v12.rds")

## Subset vdem data
vdem_keep <- dplyr::select(
  vdem, 
  c(
    COWcode, year, v2x_polyarchy, v2csgender_ord, v2csrlgrep_ord, 
    v2mecenefm_ord, v2clacjust_ord
  )
) %>% distinct(
  COWcode, year, .keep_all = T
)


## Subset QOG data
qog_democ_spread <- dplyr::select(
  qog, year, ccodecow, gle_cgdpc, wdi_gdpcapgr, ht_colonial, bl_asyf, bl_asym
) %>% distinct(
  year, ccodecow, .keep_all = T
)

## Turn polity data into yearly data
polity <- dplyr::select(
  polity,
  c(
    "scode", "country", "ccode", 
    "bmonth", "bday", "byear", 
    "emonth", "eday", "eyear",
    "democ", "autoc", "polity", 
    "xrreg", "xrcomp", "xropen", "xconst", 
    "parreg", "parcomp", "exrec", "exconst", "polcomp"
  )
) %>% mutate(
  ## Recode data for merge
  ccode = ifelse(country == "Montenegro", 341, ccode),
  ccode = ifelse(country == "Pakistan", 770, ccode),
  ccode = ifelse(country == "USSR", 365, ccode),
  ccode = ifelse(country == "Ethiopia", 530, ccode),
  ccode = ifelse(country == "South Sudan", 626, ccode),
  ccode = ifelse(country == "Vietnam", 816, ccode),
  ccode = ifelse(country == "Gran Colombia", 100, ccode),
  ccode = ifelse(country == "Serbia", 345, ccode),
  ccode = ifelse(country == "Sardinia", 325, ccode),
  eyear = ifelse(eyear == 9999, max(network_layers$year), eyear)
) %>% filter(
  eyear >= min(network_layers$year),
  !(
    democ < -10 & autoc < -10 & polity < -10 & xrreg < -10 & xrcomp < -10 & 
      xropen < -10 & xconst < -10 & parreg < -10 & parcomp < -10 &   exrec < -10 & 
      exconst < -10 & polcomp < -10)
) 

## Check if any are missing in the merge
missing_ccode  <- unique(c(polity$ccode)[!(c(polity$ccode) %in% c(network_layers$ccode1, network_layers$ccode2))])
missing_states <- filter(
  polity, 
  ccode %in% missing_ccode 
) %>% distinct(
  scode, country, ccode, byear
) %>% arrange(
  country
)

## Check missing for recode
check_dyads <- filter(
  network_layers, 
  grepl(paste0(missing_states$country, collapse = "|"), cowname1, ignore.case = T) |
    grepl(paste0(missing_states$country, collapse = "|"), cowname1, ignore.case = T)
) %>% mutate(
  country = ifelse(
    grepl(paste0(missing_states$country, collapse = "|"), cowname1, ignore.case = T), cowname1, cowname2
  ),
  ccode = ifelse(
    grepl(paste0(missing_states$country, collapse = "|"), cowname1, ignore.case = T), ccode1, ccode2
  )
) %>% distinct(
  ccode, country
) %>% arrange(
  ccode
)

# Fix Sundan name
check_sudan <- filter(
  network_layers, 
  grepl("sudan", cowname1, ignore.case = T) | 
    grepl("sudan", cowname2, ignore.case = T),
  year >= 2011
) %>% mutate(
  country = ifelse(
    grepl("sudan", cowname1, ignore.case = T), cowname1, cowname2
  ),
  ccode = ifelse(
    grepl("sudan", cowname1, ignore.case = T), ccode1, ccode2
  )
) %>% distinct(
  ccode, country
) %>% arrange(
  ccode
) %>% filter(
  grepl("sudan", country, ignore.case = T)
)

## Fix Colombia name
check_columbia <- filter(
  network_layers, 
  grepl("colombia", cowname1, ignore.case = T) | 
    grepl("colombia", cowname2, ignore.case = T)
) %>% mutate(
  country = ifelse(
    grepl("colombia", cowname1, ignore.case = T), cowname1, cowname2
  ),
  ccode = ifelse(
    grepl("colombia", cowname1, ignore.case = T), ccode1, ccode2
  )
) %>% distinct(
  ccode, country
) %>% arrange(
  ccode
) %>% filter(
  grepl("colombia", country, ignore.case = T)
)

## Create Polity data by year
polity_ccode <- unique(polity$ccode)
polity_long  <- data.frame() 
for (i in 1:length(polity_ccode))
{
  fix_data <- filter(
    polity, 
    ccode == polity_ccode[i]
  )
  
  country_temp <- list()
  for (z in 1:nrow(fix_data))
  {
    country_temp[[z]] <- data.frame(
      ccode = polity_ccode[i],
      scode = fix_data$scode[1],
      country = fix_data$country[1],
      year    = fix_data$byear[z]:fix_data$eyear[z], 
      democ   =  fix_data$democ[z], 
      autoc   =  fix_data$autoc[z],
      polity  =  fix_data$polity[z], 
      xrreg   =  fix_data$xrreg[z],
      xrcomp  =  fix_data$xrcomp[z],
      xropen  =  fix_data$xropen[z],
      xconst  =  fix_data$xconst[z], 
      parreg  =  fix_data$parreg[z], 
      parcomp =  fix_data$parcomp[z], 
      exrec   =  fix_data$exrec[z], 
      exconst =  fix_data$exconst[z],
      polcomp =  fix_data$polcomp[z],
      bmonth = fix_data$bmonth[z],
      emonth = fix_data$emonth[z],
      orig_year = fix_data$byear[z],
      eyear = fix_data$eyear[z]
    ) %>% filter(
      year >= 1816
    )
  }
  country_temp <- do.call(bind_rows, country_temp)
  repeat_years <- mutate(
    country_temp,
    n = 1
  ) %>% aggregate(
    data = ., 
    n ~ year, FUN = "sum"
  ) %>% filter(
    n > 1
  ) %>% left_join(
    country_temp,
    by = "year"
  ) %>% dplyr::select(
    year, orig_year, n
  ) %>% group_by(
    year
  ) %>% filter(
    orig_year == min(orig_year)
  ) %>% ungroup(.)
  
  country_temp <- mutate(
    country_temp, 
    democ = ifelse(year %in% repeat_years$year, NA, democ), 
    autoc = ifelse(year %in% repeat_years$year, NA, autoc), 
    polity = ifelse(year %in% repeat_years$year, NA, polity), 
    xrreg = ifelse(year %in% repeat_years$year, NA, xrreg), 
    xrcomp = ifelse(year %in% repeat_years$year, NA, xrcomp), 
    xropen = ifelse(year %in% repeat_years$year, NA, xropen), 
    xconst = ifelse(year %in% repeat_years$year, NA, xconst), 
    parreg = ifelse(year %in% repeat_years$year, NA, parreg), 
    parcomp = ifelse(year %in% repeat_years$year, NA, parcomp), 
    exrec = ifelse(year %in% repeat_years$year, NA, exrec), 
    exconst = ifelse(year %in% repeat_years$year, NA, exconst),
    polcomp = ifelse(year %in% repeat_years$year, NA, polcomp)
  ) %>% distinct(
    ccode, year, .keep_all = T
  )
  
  
  country_temp <- dplyr::select(
    country_temp, 
    -c(
      bmonth, emonth, eyear, orig_year
    )
  )
  
  polity_long <- bind_rows(
    polity_long, 
    country_temp
  ) 
}

## Fix country code
polity_long <- mutate(
  polity_long, 
  ccode = ifelse(ccode == 305 & year <= 1918, 300, ccode)
)

## Check missing codes
missing_ccode  <- unique(c(network_layers$ccode1, network_layers$ccode2))[!(unique(c(network_layers$ccode1, network_layers$ccode2)) %in% c(polity_long$ccode))]


## Check missing states
missing_states <- mutate(
  network_layers,
  country = ifelse(
    ccode1 %in% missing_ccode, cowname1, NA
  ),
  country = ifelse(
    ccode2 %in% missing_ccode, cowname2, country
  ),
  
  country = ifelse(
    ccode1 %in% missing_ccode & is.na(country), cowc1, country
  ),
  country = ifelse(
    ccode2 %in% missing_ccode  & is.na(country), cowc2, country
  ),
  
  ccode = ifelse(
    ccode1 %in% missing_ccode, ccode1, NA
  ),
  ccode = ifelse(
    ccode2 %in% missing_ccode, ccode2, ccode2
  )
) %>% distinct(
  ccode, country, year
) %>% arrange(
  ccode
) %>% filter(
  ccode %in% missing_ccode
)

sort(unique(missing_states$country))

## Bring in missing state data again
polity.df <- bind_rows(
  polity_long, 
  missing_states
)

## Fix other names that are wrong
other_missing <- mutate(
  network_layers, 
  ccode = ccode1, 
  country = cowname1
) %>% dplyr::select(
  ccode, country, year
)

other_missing <- mutate(
  network_layers, 
  ccode = ccode2, 
  country = cowname2
) %>% dplyr::select(
  ccode, country, year
) %>% bind_rows(
  other_missing
) %>% distinct(
  ccode, country, year
) %>% filter(
  !(paste0(ccode, "-", year) %in% paste0(polity.df$ccode, "-", polity.df$year))
)

## Merge back in
polity.df <- bind_rows(
  polity.df, 
  other_missing
)


## Merge to other variables
full.data <- left_join(
  polity.df,
  full.data, 
  by = c(
    "ccode", "year"
  )
)

## Merge in network layers
regions <- mutate(
  network_layers, 
  ccode = ccode1, 
  region = region1
) 

## Merge in regions
regions <- mutate(
  network_layers, 
  ccode = ccode2, 
  region = region2
) %>% bind_rows(
  regions 
) %>% distinct(
  ccode, region
)

## Bring data together
full.data <- left_join(
  full.data, 
  regions, 
  by = c("ccode")
) 

full.data <- left_join(
  full.data, 
  vdem_keep,
  by = c(
    "ccode" = "COWcode", "year"
  )
)

## Data for imputation
polity.nmc.vdem <- dplyr::select(
  full.data,
  c(
    scode, ccode, year, country, polity, democ, 
    autoc, xrreg, xrcomp, xropen, xconst, 
    parreg, parcomp, exrec, exconst, polcomp, 
    milex, milper, irst, pec, tpop, upop, cinc,
    v2x_polyarchy, v2csgender_ord, v2csrlgrep_ord, v2mecenefm_ord, v2clacjust_ord,
    region
  )
) 

## Set number missing values to actually missing
for (i in 1:ncol(polity.nmc.vdem))
{
  if (is.numeric(polity.nmc.vdem[,i]))
  {
    polity.nmc.vdem[,i][polity.nmc.vdem[,i] < -10] <- NA
  }
}

## Call this order data
order_data <- dplyr::rename(
  intl_groups,
  id = interlayer
) %>% 
  mutate(
    id = as.character(id),
    year = as.numeric(as.character(year))
  )

## Bring in the data
polity.nmc.vdem <- 
  left_join(
    polity.nmc.vdem,
    order_data,
    by = c("ccode", "year")
  )

## Subset to only relevant data
polity.nmc.vdem <- filter(
  polity.nmc.vdem,
  paste0(ccode, "-", year) %in% c(paste0(network_layers$ccode1, "-", network_layers$year), paste0(network_layers$ccode2, "-", network_layers$year))
) 


## Create factor variables for imputation
all.vars <- unique(c(paste0(network_layers$ccode1, "-", network_layers$year), paste0(network_layers$ccode2, "-", network_layers$year)))
all.polity.states <- paste0(polity.nmc.vdem$ccode, "-", polity.nmc.vdem$year)
all.vars[!(all.vars %in% all.polity.states)]


## What will be imputed
data.to.impute <- filter(
  polity.nmc.vdem,
  year <= 2014
) %>% distinct(
  ccode, year, .keep_all = T
)

## Change variable class for imputation
for (i in 1:ncol(data.to.impute))
{
  unique_vars <- unique(data.to.impute[,i])
  unique_vars <- unique_vars[!is.na(unique_vars)]
  
  if (is.numeric(data.to.impute[,i]))
  {
    if (!(colnames(data.to.impute)[i] %in% c("polity", "region")))
    {
      data.to.impute[,i] <- ifelse(data.to.impute[,i] < 0, NA, data.to.impute[,i])  
    }
  }
  if (length(unique_vars) < 10)
  {
    data.to.impute[,i] <- as.factor(as.character(data.to.impute[,i]))
  } 
}

## Change polity to factor
data.to.impute$polity <- factor(
  as.character(data.to.impute$polity),
  levels = c(
    min(data.to.impute$polity, na.rm = T):max(data.to.impute$polity, na.rm = T)
  )
)

## Change democ to factor
data.to.impute$democ <- factor(
  as.character(data.to.impute$democ),
  levels = c(
    min(data.to.impute$democ, na.rm = T):max(data.to.impute$democ, na.rm = T)
  )
)

## Change autoc to factor
data.to.impute$autoc <- factor(
  as.character(data.to.impute$autoc),
  levels = c(
    min(data.to.impute$autoc, na.rm = T):max(data.to.impute$autoc, na.rm = T)
  )
)

## Change polcomp to factor
data.to.impute$polcomp <- factor(
  as.character(data.to.impute$polcomp),
  levels = c(
    min(data.to.impute$polcomp, na.rm = T):max(data.to.impute$polcomp, na.rm = T)
  )
)

## Change xrreg to factor
data.to.impute$xrreg <- factor(
  as.character(data.to.impute$xrreg),
  levels = c(
    min(as.numeric(as.character(data.to.impute$xrreg)), na.rm = T):max(as.numeric(as.character(data.to.impute$xrreg)), na.rm = T)
  )
)

## Change xrcomp to factor
data.to.impute$xrcomp <- factor(
  as.character(data.to.impute$xrcomp),
  levels = c(
    min(as.numeric(as.character(data.to.impute$xrcomp)), na.rm = T):max(as.numeric(as.character(data.to.impute$xrcomp)), na.rm = T)
  )
)

## Change xropen to factor
data.to.impute$xropen <- factor(
  as.character(data.to.impute$xropen),
  levels = c(
    min(as.numeric(as.character(data.to.impute$xropen)), na.rm = T):max(as.numeric(as.character(data.to.impute$xropen)), na.rm = T)
  )
)

## Change xconst to factor
data.to.impute$xconst <- factor(
  as.character(data.to.impute$xconst),
  levels = c(
    min(as.numeric(as.character(data.to.impute$xconst)), na.rm = T):max(as.numeric(as.character(data.to.impute$xconst)), na.rm = T)
  )
)

## Change parreg to factor
data.to.impute$parreg <- factor(
  as.character(data.to.impute$parreg),
  levels = c(
    min(as.numeric(as.character(data.to.impute$parreg)), na.rm = T):max(as.numeric(as.character(data.to.impute$parreg)), na.rm = T)
  )
)

## Change exrec to factor
data.to.impute$exrec <- factor(
  as.character(data.to.impute$exrec),
  levels = c(
    min(as.numeric(as.character(data.to.impute$exrec)), na.rm = T):max(as.numeric(as.character(data.to.impute$exrec)), na.rm = T)
  )
)

## Change exconst to factor
data.to.impute$exconst <- factor(
  as.character(data.to.impute$exconst),
  levels = c(
    min(as.numeric(as.character(data.to.impute$exconst)), na.rm = T):max(as.numeric(as.character(data.to.impute$exconst)), na.rm = T)
  )
)

## Set up data to impute
data.to.impute <- arrange(
  data.to.impute,
  year, ccode, names
)


## Merge id for afterward
data.to.id <- dplyr::select(
  data.to.impute, 
  c(
    id, group, n, ccode, country, names
  )
)

## More merge IDs
data.to.impute <- dplyr::select(
  data.to.impute, 
  -c(
    scode,id, group, n, country
  )
)

## Impute by year
impute_years <- sort(unique(data.to.impute$year))
impute_values <- list()
set.seed(3709)
for (i in impute_years)
{
  ## Imputations are within each only
  temp_impute <- filter(
    data.to.impute, year == i
    ) %>% arrange(
      ccode, names
      ) %>% dplyr::select(
        -c(
          ccode, names
          )
        )
 suppressMessages({
   impute_values[[length(impute_values) + 1]] <- 
     missForest::missForest(
       temp_impute,
       verbose = F,
       variablewise = TRUE
 )}) 
 cat("\n")
}

## Save imputed data
saveRDS(impute_values, file = "data/orig_imputed_values_replication.rds")
impute_values <- readRDS("data/orig_imputed_values_replication.rds")
data.imputed <- data.frame()

for (i in 1:length(impute_values))
{
  data.imputed <- bind_rows(
    data.imputed,
    impute_values[[i]]$x
  )
}


all.data <- bind_cols(data.to.id, data.imputed)
all.data <- all.data[,!(colnames(all.data) %in% c(unique(data.to.id$names)))]

for (i in 7:ncol(all.data))
{
  if (!(colnames(all.data)[i] %in% c("region", "country", "names")))
  {
    all.data[,i] <- as.numeric(as.character(all.data[,i])) 
  }
}

## Export data for analysis
saveRDS(polity.nmc.vdem, file = "data/unimputed_values_replication.rds")
saveRDS(data.imputed, file = "data/imputed_values_replication.rds")
saveRDS(all.data, file = "data/data_for_analysis_replication.rds")


